Word Count = 2236

Introduction

This report presents an exploratory data analysis (EDA) and predictive modeling for:

  1. Medical Insurance Charges Prediction using multiple regression techniques, LASSO regression, Boosting, Random Forest and Bayesian Additive Regression Trees (BART)

  2. Stroke Prediction using Step-wise logistic regression, LASSO, Linear Discriminant Analysis (LDA), Quadratic Discriminant Analysis (QDA) and Random Forest.

Install and Load Libraries

# Install packages if not installed (once).
# requireNamespace() checks availability WITHOUT attaching the package
# (attachment is done explicitly by the library() calls that follow),
# which is the recommended idiom for conditional installs.
pkgs <- c(
  "tidyverse",    # data manipulation
  "car",          # vif(); multicollinearity
  "tableone",     # descriptive statistics
  "psych",        # descriptive statistics
  "caret",        # data splitting and model training
  "dplyr",        # data manipulation
  "GGally",       # pairwise scatter plot
  "boot",         # bootstrapping and cv
  "corrplot",     # correlation matrix
  "faraway",      # model diagnostics
  "tidyr",        # data manipulation
  "leaps",        # variable selection, regsubsets
  "MASS",         # lda & qda
  "mgcv",         # gam() / nonlinear models
  "splines",      # splines for nonlinear models
  "glmnet",       # L1 & L2, cv.glmnet() to automate lambda tuning
  "tree",         # decision tree
  "gmodels",      # cross tabulation
  "randomForest", # rf models
  "gbm",          # boosting
  "BART",         # Bayesian Additive Regression Trees
  "Matrix",       # sparse matrix for Lasso coefficients
  "mice",         # imputation
  "pROC"          # ROC curves
)
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
}


# Load (attach) all required libraries in one pass, in the same order as the
# conditional installs above. invisible() suppresses lapply's return value.
invisible(lapply(
  c(
    "tidyverse", "car", "tableone", "psych", "caret", "dplyr", "GGally",
    "boot", "corrplot", "faraway", "tidyr", "leaps", "MASS", "mgcv",
    "splines", "glmnet", "tree", "gmodels", "randomForest", "gbm",
    "BART", "Matrix", "mice", "pROC"
  ),
  library,
  character.only = TRUE
))

Question 1: Medical Insurance Charges Prediction

Load the Data

df <- read.csv("/Users/20lau01/Documents/GitHub/Data Science Modelling/Assessment/medical_insurance.csv")

Exploratory Data Analysis (EDA)

# Understand the data: dimensions, per-column summaries, types, missingness
dim(df)
summary(df)
glimpse(df)
if (interactive()) View(df)  # View() is only meaningful in an interactive session
sum(is.na(df))  # total missing values across the whole data frame

# Factorise categorical variables (children is treated as categorical here,
# not as a count)
cols <- c("children", "sex", "smoker", "region")
df[cols] <- lapply(df[cols], as.factor)

# Make sure charges, age and bmi are numeric
df$charges <- as.numeric(df$charges)
df$age <- as.numeric(df$age)
df$bmi <- as.numeric(df$bmi)
##                      
##                       level     Overall              
##   n                                 1,338            
##   charges (mean (SD))           13,270.42 (12,110.01)
##   age (mean (SD))                   39.21 (14.05)    
##   bmi (mean (SD))                   30.66 (6.10)     
##   sex (%)             female          662 (49.5)     
##                       male            676 (50.5)     
##   smoker (%)          no             1064 (79.5)     
##                       yes             274 (20.5)     
##   children (%)        0               574 (42.9)     
##                       1               324 (24.2)     
##                       2               240 (17.9)     
##                       3               157 (11.7)     
##                       4                25 ( 1.9)     
##                       5                18 ( 1.3)     
##   region (%)          northeast       324 (24.2)     
##                       northwest       325 (24.3)     
##                       southeast       364 (27.2)     
##                       southwest       325 (24.3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1122    4740    9382   13270   16640   63770

# Explore transformations for the response variable, charges
par(mfrow = c(1, 1))
symbox(~charges, data = df, col = "blue", pch = 19) # Log transformation will be done for charges (y)

hist(log(df$charges)) # charges more normally distributed after log transformation

summary(log(df$charges)) # mean = 9.10, median = 9.15
# Build a copy of the data with log-transformed charges (dff) and drop the
# original charges column so models on dff use only the transformed response
dff <- df
dff$log_charges <- log(df$charges)
dff$charges <- NULL

# Keep only the numeric variables by dropping the factor columns
numeric_vars <- df %>% dplyr::select(-c(smoker, region, sex, children))
cor_matrix <- cor(numeric_vars) # Correlation matrix (numeric variables only)
cor_matrix
##           charges       age       bmi
## charges 1.0000000 0.2990082 0.1983410
## age     0.2990082 1.0000000 0.1092719
## bmi     0.1983410 0.1092719 1.0000000
# Pairwise relationships
GGally::ggpairs(df)

GGally::ggpairs(numeric_vars) # numeric variables only

# Examine numeric variable graphics more clearly
scatterplot(df$bmi, df$charges) # past a bmi of 30, the charges increase significantly

scatterplot(df$age, df$charges) # no abrupt change in charges for age

# Statistical testing (significant results)
# T test for numerical x binary; H0: the means of the two groups are equal
# The t-test assumes normality, so dff and log charges are used
t.test(dff$log_charges ~ dff$smoker) # Reject H0

# Chi square test; H0: the variables are independent
# smoker x sex  (expected = TRUE, not T: T/F can be reassigned and are unsafe)
with(df, CrossTable(sex, smoker, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS")) # Reject H0, statistically significant association

# ANOVA
# bmi x region
aov4 <- aov(bmi ~ factor(region), data = dff)
summary(aov4)
# significant difference in bmi between the regions

# charges x children
aov5 <- aov(log_charges ~ factor(children), data = dff)
summary(aov5)
# significant difference in charges between the number of children

Discussion: Exploratory Data Analysis for Medical Insurance Charges Prediction

Exploratory analysis of the Medical Insurance data set provides important insights into the charges patterns and factors influencing them.

1. Data Quality & Structure

The data set is well-structured with no missing values. The data includes individual characteristics like Age, BMI and Sex as well as lifestyle indicators such as whether the individual is a Smoker or not, the geographical region of the individual, and the number of Children they have. These 6 explanatory variables from 1,338 individuals may or may not help predict an individual’s medical insurance charges.

3. Bivariate analysis and statistical testing

Examining relationships between Charges and other variables shows:

  • BMI has a weak positive Pearson correlation with Charges (r = 0.20), so BMI alone may not strongly influence medical insurance charges. However, there are assumption violations and further exploration is required as there could be non-linear relationships. Visual exploration suggests medical charges are higher for some individuals who have a BMI higher than 30.
  • Age showed a modest positive Pearson correlation (r = 0.30) with charges, consistent with the scatterplot findings; older individuals in the data have higher medical insurance costs.
  • A two-sample (Welch) t-test revealed the mean difference in medical insurance charges between smokers and non-smokers is significant, with smokers having higher average medical Charges.
  • Sex does not appear to have a notable impact on Charges; a two-sample t-test showed the mean difference in Charges between males and females was not statistically significant.
  • ANOVA highlight a statistically significant association between the number of Children a person has and their medical Charges.
  • Charges vary slightly between the different Regions. Grouped box plots indicate that those living in the Southeast region tend to have slightly higher Charges. ANOVA showed the association between Region and Charges is not statistically significant.

4. Potential Concerns & Improvements

Several aspects of the data require attention before modeling:

  • Outliers in Charges: Some individuals have extremely high or low charges. Transforming the data may help reduce their impact, but it can distort the relationship between charges and predictors. The original variable will be used when fitting non-linear models.

  • Non-Linear Relationships: Relationship between Charges and BMI may not be linear as scatterplots suggest. Exploring polynomials and non-linear models like Random Forest may be beneficial.

  • Multicollinearity among predictors: Age with other predictors should be assessed, as it may affect model performance. Interactions like smoker:sex and bmi:region could also improve predictive performance, though their relevance requires cautious validation given earlier assumption violations in preliminary hypothesis testing.

5. Conclusion

Medical Insurance Charges of individuals are influenced by personal and lifestyle factors. The data set provides valuable predictors. Improvements can be made by addressing outliers and exploring non-linear relationships and interaction terms. This will help build models and render insights that achieve better predictive accuracy.

Regression Models

Split the data into training and testing sets

# The validation set approach benefits from computational efficiency, but
# wastes part of the data. 10-fold cross-validation is more robust, as it
# uses all data points for training and testing, but is computationally more
# expensive. The validation set approach is used here for exploratory
# purposes; for more stable performance estimates of statistical learning
# methods and for final model selection, 10-fold cross-validation is used.
# k = 10 strikes a practical balance in the bias-variance trade-off and is a
# common choice in practice.

set.seed(123) # set seed for reproducibility
# 80/20 split stratified on the log-response; returns training row indices
trainIndexdff <- createDataPartition(dff$log_charges, p=0.8, list=FALSE, times=1)
train_datadff <- dff[trainIndexdff, ]
test_datadff <- dff[-trainIndexdff, ]

Multiple Linear Regression using Validation Set Approach

## 
## Call:
## lm(formula = log_charges ~ ., data = train_datadff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06118 -0.19719 -0.04629  0.07727  2.12656 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0469214  0.0812944  86.684  < 2e-16 ***
## age              0.0339824  0.0009778  34.754  < 2e-16 ***
## sexmale         -0.0904965  0.0272236  -3.324 0.000917 ***
## bmi              0.0134105  0.0023427   5.724 1.35e-08 ***
## children1        0.1594599  0.0342543   4.655 3.65e-06 ***
## children2        0.2469590  0.0384403   6.424 2.00e-10 ***
## children3        0.2488788  0.0450825   5.521 4.25e-08 ***
## children4        0.4580122  0.1014335   4.515 7.03e-06 ***
## children5        0.3698519  0.1252153   2.954 0.003209 ** 
## smokeryes        1.5748528  0.0336888  46.747  < 2e-16 ***
## regionnorthwest -0.0840566  0.0393763  -2.135 0.033015 *  
## regionsoutheast -0.1602774  0.0387394  -4.137 3.79e-05 ***
## regionsouthwest -0.1256877  0.0396387  -3.171 0.001564 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4435 on 1059 degrees of freedom
## Multiple R-squared:  0.7707, Adjusted R-squared:  0.7681 
## F-statistic: 296.7 on 12 and 1059 DF,  p-value: < 2.2e-16
## RMSE of the full model: 8314.693

Multiple Linear Regression using 10-fold Cross-validation

# Baseline model for comparison: ordinary least squares on the raw
# (untransformed) charges, evaluated with 10-fold cross-validation via caret
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
mlr_model <- train(charges ~ ., 
                   data = df, 
                   method = "lm", 
                   trControl = ctrl)
summary(mlr_model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11689.4  -2902.6   -943.7   1492.2  30042.7 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11927.17     993.66 -12.003  < 2e-16 ***
## age                257.19      11.91  21.587  < 2e-16 ***
## bmi                336.91      28.61  11.775  < 2e-16 ***
## children1          390.98     421.35   0.928 0.353619    
## children2         1635.78     466.67   3.505 0.000471 ***
## children3          964.34     548.10   1.759 0.078735 .  
## children4         2947.37    1239.16   2.379 0.017524 *  
## children5         1116.04    1456.02   0.767 0.443514    
## sexmale           -128.16     332.83  -0.385 0.700254    
## smokeryes        23836.41     414.14  57.557  < 2e-16 ***
## regionnorthwest   -380.04     476.56  -0.797 0.425318    
## regionsoutheast  -1033.14     479.14  -2.156 0.031245 *  
## regionsouthwest   -952.89     478.15  -1.993 0.046483 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6059 on 1325 degrees of freedom
## Multiple R-squared:  0.7519, Adjusted R-squared:  0.7497 
## F-statistic: 334.7 on 12 and 1325 DF,  p-value: < 2.2e-16
# caret stores the cross-validated RMSE (averaged over the 10 folds) in results
mlr_rmse_cv <- mlr_model$results$RMSE # RMSE of the model
cat("RMSE of the model using 10-fold cross-validation:", mlr_rmse_cv, "\n")
## RMSE of the model using 10-fold cross-validation: 6083.203

Interpretation of Multiple Linear Regression

The model explains 75% of the variance in the data, with smoker being the most significant predictor.

The model indicates:

  • Smokers are predicted to have charges that are $23,836 higher than non-smokers, cet.par.
  • An additional year of age is associated with a $257 increase in charges on average, cet.par.
  • A one unit increase in bmi increases charges by $337 on average, cet.par.

Split Data into Training and Testing Sets for Non-Linear Models

# 80/20 split on the raw charges for the non-linear models
# (same seed as before so the analysis is reproducible)
set.seed(123)
trainIndex <- createDataPartition(df$charges, p=0.8, list=FALSE, times=1)
train_data <- df[trainIndex, ]
test_data <- df[-trainIndex, ]

Consideration of Interaction Terms

# Fit a model with candidate two-way interaction terms on the log-charges
# training split, to screen which interactions are worth keeping
interaction_model <- lm(log_charges ~ sex * smoker + age * bmi + age * smoker + bmi * smoker + region * smoker + bmi * region, data = train_datadff)
summary(interaction_model) 
## 
## Call:
## lm(formula = log_charges ~ sex * smoker + age * bmi + age * smoker + 
##     bmi * smoker + region * smoker + bmi * region, data = train_datadff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81074 -0.18724 -0.06788  0.07532  2.35683 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.083e+00  2.153e-01  32.894  < 2e-16 ***
## sexmale                   -1.164e-01  2.782e-02  -4.185 3.09e-05 ***
## smokeryes                  1.149e+00  1.834e-01   6.265 5.43e-10 ***
## age                        4.443e-02  4.607e-03   9.643  < 2e-16 ***
## bmi                        7.676e-03  7.002e-03   1.096   0.2732    
## regionnorthwest           -2.232e-01  1.885e-01  -1.184   0.2368    
## regionsoutheast            1.480e-01  1.779e-01   0.832   0.4056    
## regionsouthwest           -1.851e-01  1.924e-01  -0.962   0.3362    
## sexmale:smokeryes          1.164e-01  6.243e-02   1.864   0.0626 .  
## age:bmi                   -9.966e-05  1.456e-04  -0.684   0.4939    
## smokeryes:age             -3.259e-02  2.206e-03 -14.777  < 2e-16 ***
## smokeryes:bmi              4.979e-02  5.275e-03   9.437  < 2e-16 ***
## smokeryes:regionnorthwest  7.941e-02  8.998e-02   0.883   0.3777    
## smokeryes:regionsoutheast  8.579e-02  8.536e-02   1.005   0.3151    
## smokeryes:regionsouthwest  1.695e-01  9.205e-02   1.841   0.0659 .  
## bmi:regionnorthwest        4.839e-03  6.299e-03   0.768   0.4426    
## bmi:regionsoutheast       -9.360e-03  5.554e-03  -1.685   0.0923 .  
## bmi:regionsouthwest        5.363e-04  6.244e-03   0.086   0.9316    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4028 on 1054 degrees of freedom
## Multiple R-squared:  0.8118, Adjusted R-squared:  0.8088 
## F-statistic: 267.4 on 17 and 1054 DF,  p-value: < 2.2e-16
# Significant: 
# smoker: age 
# smoker: bmi

# bmi:regionsoutheast is significant (.02) but other regions are not and EDA also suggests
# that we should ignore this interaction

Visualise Key Interactions Non-linearly

Multiple Linear Regression with Interaction Terms

set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)  

# Fit a multiple linear regression model with the two interaction terms found
# significant in the exploratory interaction model (smoker:age and smoker:bmi)
mlr_modelIT <- train(charges ~ .+ smoker:age + smoker:bmi, 
                   data = df, 
                   method = "lm", 
                   trControl = ctrl)
# Cross-validated RMSE of the interaction model
mlr_rmse_cvIT <- mlr_modelIT$results$RMSE 
cat("RMSE of the model with interaction terms using 10-fold cross-validation:", mlr_rmse_cvIT, "\n")
## RMSE of the model with interaction terms using 10-fold cross-validation: 4853.675

Best Subset Selection (BREG) using Validation Set Approach

## Best number of variables according to adjusted R-squared: 11
## Best number of variables according to Cp: 10

## Best number of variables according to BIC: 3

# Coefficients of the best model according to BIC
# (regfit.full is the regsubsets() fit from the preceding, unshown chunk)
coef(regfit.full, 3) 
##   (Intercept)           age     smokeryes bmi:smokeryes 
##    -1934.3416      260.9886   -21353.7046     1477.9478
## RMSE of the best subset model using BIC: 5256.636

Best Subset Selection (BREG) using 10-fold Cross-validation

## Best model using 1-SE rule: 2

## RMSE of the best subset model using 1-SE rule: 5239.712
##   (Intercept)           age bmi:smokeryes 
##    -2252.6938      263.8284      818.4120

LASSO with interaction terms using 10-fold Cross-validation

# Standardise the numeric columns (centre by the mean, scale by the sd).
# means and sds are kept so LASSO coefficients can be unstandardised later.
set.seed(1)
numeric_columns <- names(df)[vapply(df, is.numeric, logical(1))]
means <- colMeans(df[, numeric_columns])
sds <- apply(df[, numeric_columns], 2, sd)

# Scaled copy of the data: (x - mean) / sd, column by column
df_scaled <- df
df_scaled[numeric_columns] <- Map(function(v, m, s) (v - m) / s,
                                  df[numeric_columns], means, sds)
## 
## Call:  cv.glmnet(x = X, y = y, nfolds = 10, alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##      Lambda Index Measure       SE Nonzero
## min 0.00204    65  0.1625 0.009744      13
## 1se 0.04829    31  0.1717 0.010313       3
## Optimal lambda using 10-fold cross-validation: 0.002042173
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                 unstandardized_coef
## (Intercept)            -1.268029852
## age                     0.021713234
## bmi                     0.001434399
## children1               0.016241252
## children2               0.115133355
## children3               0.071736829
## children4               0.262040891
## children5               0.124442091
## sexmale                -0.036249941
## smokeryes               1.962638567
## regionnorthwest        -0.034215475
## regionsoutheast        -0.082548719
## regionsouthwest        -0.085481037
## age:smokeryes           .          
## bmi:smokeryes           0.722569448
## RMSE of the LASSO model with interaction terms using 10-fold cross-validation: 4816.436

Tree based methods

Decision Tree

## 
## Regression tree:
## tree(formula = charges ~ ., data = train_data)
## Variables actually used in tree construction:
## [1] "smoker" "age"    "bmi"   
## Number of terminal nodes:  5 
## Residual mean deviance:  22920000 = 2.446e+10 / 1067 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -8907.0 -2920.0  -958.4     0.0  1013.0 24390.0

## RMSE of the Decision Tree model: 5316.753

## RMSE of the Pruned Decision Tree model: 6313.892
## Selected cp using 1-SE rule: 0.06
## Final RMSE of the Decision Tree model using 10-fold cross-validation: 5082.757

Boosting

Boosting is used as another method for variable selection as unlike LASSO and best subset selection, it can model non-linear relationships and capture interactions well.

## RMSE of the Boosting model: 4993.59

Boosting on 10-fold Cross-validation

## RMSE of the Boosting model using 10-fold cross-validation: 4480.496

Random Forest

## RMSE of the Random Forest model: 4665.599

Random Forest with Interaction Term

## RMSE of the Random Forest model with interaction terms: 4596.214

The variable importance differs slightly, as the interaction terms are treated as more important. Age is ranked as more important than smoker when the interactions are considered explicitly.

Bayesian Additive Regression Trees (BART) using Validation Set Approach

Bayesian Additive Regression Trees (BART) using 10-fold Cross-validation

# 10-fold cross-validation for BART (gbart). Fold membership is assigned by
# independent random draws, so fold sizes are only approximately equal.
set.seed(123)
k <- 10
folds <- sample(1:k, nrow(df), replace = TRUE)
rmse_list <- numeric(k)  # preallocate one held-out RMSE per fold

for (i in seq_len(k)) {
  test_idx <- which(folds == i)
  train_idx <- which(folds != i)

  # Fit BART on the training folds and predict the held-out fold
  bart_cvmodel <- gbart(x[train_idx, ], y[train_idx],
                        x.test = x[test_idx, ],
                        ntree = 50, k = 2)

  # Held-out root mean squared error for this fold
  rmse_list[i] <- sqrt(mean((y[test_idx] - bart_cvmodel$yhat.test.mean)^2))
}
## 10-Fold CV RMSE (gbart): 4446.968

Model Comparison

## Best Performing Model (RMSE): BART
# Bar chart of model RMSEs; the best (lowest-RMSE) model is highlighted in
# orange. Fix: the y-axis label previously said "Mean Squared Error (RMSE)" —
# RMSE is the ROOT mean squared error. geom_col() is the idiomatic form of
# geom_bar(stat = "identity").
ggplot(model_performance, aes(x = reorder(Model, -RMSE), y = RMSE, fill = (Model == best_model))) +
  geom_col(alpha = 0.7) +
  scale_fill_manual(values = c("gray", "darkorange")) +
  geom_text(aes(label = round(RMSE, 2)), vjust = -0.5, size = 4, fontface = "bold") +
  labs(
    title = "Model Performance Comparison (RMSE)",
    y = "Root Mean Squared Error (RMSE)",
    x = "Model"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Discussion: Interpreting Model Performance & Improvements

After comparing the regression models using Root Mean Squared Error (RMSE), we can make several observations about the predictive performance of each model for medical insurance charges.

  • Smoker is the most important in predicting charges when examining main effects alone, followed by age and bmi.

  • When interactions are considered, the models suggest age and smoker:bmi may be more important than smoker alone.

Which Model Performed Best?

  • The best-performing model, with the lowest RMSE is highlighted in orange in the final plot.

  • Tree-based models (e.g. Boosting, Random Forest and BART) outperformed linear models, though linear models with interaction terms surpassed decision trees and had a lower RMSE than basic linear models. This suggests non-linear relationships in the data, making simple linear models inadequate.

  • BART had the lowest RMSE, this suggests that there may be more complex relationships in the data such as higher order interactions e.g. between smoker, age and BMI and other variables.

Key Insights from Model Performance

  • Multivariate Linear Regression: Provides a simple baseline model, but simple least squares is too inflexible here — interactions and non-linear trends may not be captured.

  • Regression with Interaction Terms: Helps capture dependencies between predictors, but the increased model complexity does not always improve accuracy, since least squares remains limited in the structure it can represent.

  • Polynomial Regression: Did not effectively capture non-linear relationships and suggested that relationships are not smooth global curves.

  • Best Subset Regression(BREG) with Interaction terms:

    • Using the 1-SE rule for variable selection, the model selected age and smoker:bmi as the top predictors. The impact of smoking on charges varies significantly with BMI (and vice versa); the joint effect explains more variance than either alone.
    • When optimized for cross validation error rather than variable selection, the model achieved a slightly lower RMSE.
  • LASSO: Provided marginal improvement over BREG. Weak lambda_min penalty resulted in a model too weak to enforce sparsity (many variables retained) and high variance (potential noise). The least important variable was age:smoker.

  • Random Forest: Achieved strong predictive accuracy, ranking second only to BART. It outperforms both decision trees and boosting methods.


Final Summary and Improvements

Tree-based methods had the lowest RMSE:
→ Medical Charges depend on complex non-linear, non-additive relationships.

Next Steps:

- It would have been reasonable to explore step functions e.g. for bmi over 30 as there may be a threshold effect. For instance, smokers who have a bmi over 30 likely have higher medical charges.
- Improve feature engineering, remove collinear variables, and better model smoker*bmi.
- Explore unsupervised methods e.g. cluster analysis to better understand bmi:smoker.
- Try advanced models like XGBoost and Hybrid Models to trade interpretability for improved accuracy.

This analysis provides a strong foundation for medical charges forecasting, but further refinements could lead to even better predictive performance.


Appendix

Statistical Tests with no statistical significance
# Welch two-sample t-tests; none of these reach statistical significance
t.test(dff$log_charges ~ dff$sex) 
t.test(dff$age ~ dff$smoker) 
t.test(dff$age ~ dff$sex) 

# Chi-squared tests (expected = TRUE, not T: T/F can be reassigned and are unsafe)
# Region x smoker
with(df, CrossTable(region, smoker, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Close to 0.05, but not statistically significant

# Region x children
with(df, CrossTable(region, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))

# Sex x region
with(df, CrossTable(sex, region, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, not statistically significant

# Children x sex
with(df, CrossTable(sex, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, not statistically significant

# Children x smoker
with(df, CrossTable(smoker, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, but note there are fewer than 5 expected counts in some
# cells, so the chi-squared approximation is questionable here

# ANOVA
# age x children
aov1 <- aov(age ~ factor(children), data = dff)
summary(aov1)
par(mfrow=c(2,2))
plot(aov1) # check residuals to decide how much weight to give the H0 conclusion
# no significant difference in age between the number of children

# age x region
aov2 <- aov(age ~ factor(region), data = dff)
summary(aov2)
par(mfrow=c(2,2))
plot(aov2)
# no significant difference in age between the regions

# bmi x children
aov3 <- aov(bmi ~ factor(children), data = dff)
summary(aov3)
par(mfrow=c(2,2))
plot(aov3)
# no significant difference in bmi between the number of children

# charges x region
aov6 <- aov(log_charges ~ factor(region), data = dff)
summary(aov6)
par(mfrow=c(2,2))
plot(aov6)
# no significant difference in charges between the different regions
Polynomial Regression
# Explore polynomial terms for age (compare degrees 1-4 via AIC and nested F-tests)
fit1 <- lm(charges ~ age, data = df)
fit2 <- lm(charges ~ poly(age, 2), data = df)
fit3 <- lm(charges ~ poly(age, 3), data = df)
fit4 <- lm(charges ~ poly(age, 4), data = df)

AIC(fit1, fit2, fit3, fit4) 
anova(fit1, fit2, fit3, fit4) 

# The more complex models haven't improved the fit much; the linear model is simpler

# Explore polynomial terms for bmi (note: these refits overwrite the age fits above)
fit1 <- lm(charges ~ bmi, data = df)
fit2 <- lm(charges ~ poly(bmi, 2), data = df)
fit3 <- lm(charges ~ poly(bmi, 3), data = df)
fit4 <- lm(charges ~ poly(bmi, 4), data = df)

AIC(fit1, fit2, fit3, fit4)
anova(fit1, fit2, fit3, fit4)
# Polynomial terms do not seem beneficial

Question 2: Stroke Prediction

Load data

df2 <- read.csv("/Users/20lau01/Documents/GitHub/Data Science Modelling/Assessment/stroke_prediction.csv")

Exploratory Data Analysis

# Explore dataset: dimensions, summaries, column types
dim(df2) 
summary(df2)
glimpse(df2)
View(df2)
# Understand missingness, which is encoded differently per column:
# Age: 0.08 appears to be a placeholder / implausible age value.
# NOTE(review): exact float comparison (== 0.08) works here because the value
# comes straight from read.csv, but float equality is fragile in general.
min(df2$age)
sum(df2$age == 0.08) 
sum(is.na(df2$age)) # Check for missing values

# BMI: stored as character because of literal "N/A" strings
sum(df2$bmi== "N/A")
sum(is.na(df2$bmi))
class(df2$bmi)

# Smoking: "Unknown" is treated as missing
sum(df2$smokingstatus == "Unknown")
sum(is.na(df2$smokingstatus)) 

# Check total missing values before recoding the sentinels
sum(is.na(df2)) 
df2$age[df2$age == 0.08] <- NA # Replace 0.08 with NA
df2$bmi[df2$bmi == "N/A"] <- NA # Replace N/A with NA
# NOTE(review): bmi is still character after this replacement; presumably it
# is converted to numeric during imputation — confirm downstream
df2$smokingstatus[df2$smokingstatus == "Unknown"] <- NA

# Factorise categorical variables
cols <- c("stroke", "gender", "evermarried", "hypertension", "heartdisease", "worktype", "residencetype", "smokingstatus") 
df2[cols] <- lapply(df2[cols], as.factor)

# Recode binary variables (assumes the factor levels are ordered "0", "1")
levels(df2$heartdisease) <- c("No", "Yes")
levels(df2$stroke) <- c("No", "Yes")

# Reorder levels for smoking status (ordered: never < formerly < smokes)
df2$smokingstatus <- factor(df2$smokingstatus, levels = c("never smoked", "formerly smoked", "smokes"), ordered = TRUE)

# Remove the single "Other" gender observation for simplicity in modelling
df2 <- df2[df2$gender != "Other",]
df2$gender <- droplevels(df2$gender)
df2$gender <- relevel(df2$gender, ref = "Male")  # Make "Male" the reference level                 
## stroke
##   No  Yes 
## 4860  249 
## evermarried
##   No  Yes 
## 1756 3353 
## hypertension
##    0    1 
## 4611  498 
## heartdisease
##   No  Yes 
## 4833  276 
## residencetype
## Rural Urban 
##  2513  2596 
## gender
##   Male Female 
##   2115   2994 
## worktype
##      children      Govt_job  Never_worked       Private Self-employed 
##           687           657            22          2924           819 
## smokingstatus
##    never smoked formerly smoked          smokes 
##            1892             884             789

Impute Missing

Descriptive Analysis

Univariate Analysis

##                              
##                               level           Overall       
##   n                                            5,109        
##   stroke (%)                  No                4860 (95.1) 
##                               Yes                249 ( 4.9) 
##   bmi (mean (SD))                              28.89 (7.85) 
##   age (mean (SD))                              43.23 (22.61)
##   avgglucoselevel (mean (SD))                 106.14 (45.29)
##   evermarried (%)             No                1756 (34.4) 
##                               Yes               3353 (65.6) 
##   hypertension (%)            0                 4611 (90.3) 
##                               1                  498 ( 9.7) 
##   heartdisease (%)            No                4833 (94.6) 
##                               Yes                276 ( 5.4) 
##   residencetype (%)           Rural             2513 (49.2) 
##                               Urban             2596 (50.8) 
##   worktype (%)                children           687 (13.4) 
##                               Govt_job           657 (12.9) 
##                               Never_worked        22 ( 0.4) 
##                               Private           2924 (57.2) 
##                               Self-employed      819 (16.0) 
##   smokingstatus (%)           never smoked      2844 (55.7) 
##                               formerly smoked   1195 (23.4) 
##                               smokes            1070 (20.9) 
##   gender (%)                  Male              2115 (41.4) 
##                               Female            2994 (58.6)

Bivariate Analysis

Relationship Visualizations
# Pairwise correlation structure of the numeric predictors.
cor_matrix2 <- cor(complete_data[, c("age", "bmi", "avgglucoselevel")])
corrplot(cor_matrix2, method = "color", tl.col = "black", tl.srt = 45)
GGally::ggpairs(complete_data[, c("age", "bmi", "avgglucoselevel")]) # note the violations in assumptions of pearson correlation (variables arent normal)

# Grouped boxplots: each numeric predictor by stroke status.
# The data= formula interface gives clean axis labels (e.g. "stroke"/"age"
# instead of "complete_data$stroke"), and the 1x3 layout is restored
# afterwards so subsequent plots are not silently drawn in a 1x3 grid.
par(mfrow = c(1, 3))
boxplot(age ~ stroke, data = complete_data, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of Age by Stroke Status")
boxplot(bmi ~ stroke, data = complete_data, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of BMI by Stroke Status")
boxplot(avgglucoselevel ~ stroke, data = complete_data, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of Avg Glucose Level by Stroke Status")
par(mfrow = c(1, 1))  # Restore default single-plot layout

T-tests
t.test(complete_data$age ~ complete_data$stroke) # Welch t-test (default var.equal = FALSE); Reject H0: mean age differs significantly by stroke status
## 
##  Welch Two Sample t-test
## 
## data:  complete_data$age by complete_data$stroke
## t = -29.678, df = 331.61, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -27.45537 -24.04196
## sample estimates:
##  mean in group No mean in group Yes 
##          41.97953          67.72819
t.test(complete_data$bmi ~ complete_data$stroke) # Welch t-test; Reject H0: mean BMI differs significantly by stroke status
## 
##  Welch Two Sample t-test
## 
## data:  complete_data$bmi by complete_data$stroke
## t = -3.6374, df = 237.84, p-value = 0.0003377
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -2.5387991 -0.7549231
## sample estimates:
##  mean in group No mean in group Yes 
##          28.82443          30.47129
t.test(complete_data$avgglucoselevel ~ complete_data$stroke) # Welch t-test; Reject H0: mean avg glucose level differs significantly by stroke status
## 
##  Welch Two Sample t-test
## 
## data:  complete_data$avgglucoselevel by complete_data$stroke
## t = -6.9844, df = 260.9, p-value = 2.373e-11
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -35.58269 -19.93162
## sample estimates:
##  mean in group No mean in group Yes 
##          104.7876          132.5447
Chi-square Tests
# Stroke x ever married
with(complete_data, CrossTable(evermarried, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and ever married

# Stroke x Hypertension
with(complete_data, CrossTable(hypertension, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant 
# Association between stroke and hypertension

# Stroke x heart disease
with(complete_data, CrossTable(heartdisease, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant 
# Association between stroke and heart disease

# Stroke associated with work type
with(complete_data, CrossTable(worktype, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant 
# Association between stroke and work type
# Expected value for never worked and has stroke is under 5, chi square of independence assumption IS VIOLATED

# Stroke associated with smoking status?
with(complete_data, CrossTable(smokingstatus, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and smoking status
Multi-collinearity Checks

# T test
# Pairwise numeric-vs-binary checks for relationships between explanatory variables.
# NOTE(review): var.equal = TRUE forces the pooled-variance Student's t-test;
# the equal-variance assumption was not formally checked (e.g. var.test) --
# confirm, or drop var.equal to use the Welch correction applied earlier.
t.test(complete_data$age ~ complete_data$gender, var.equal = TRUE) # p = 0.05, Reject H0- statistically significant but age: gender may be useless in models 
t.test(complete_data$age ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$age ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$age ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, Statistically significant

t.test(complete_data$bmi ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$bmi ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$bmi ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, Statistically significant

t.test(complete_data$avgglucoselevel ~ complete_data$gender, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, Statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, Statistically significant
# Chi-square tests between pairs of categorical explanatory variables, to
# flag associations that could induce multicollinearity in the models.
# Style: TRUE instead of the reassignable alias T; plain string for format.
with(complete_data, CrossTable(worktype, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(evermarried, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(hypertension, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant-> verifies what we suspected from ggpairs

with(complete_data, CrossTable(smokingstatus, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(hypertension, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant -> less than 5 on expected values

with(complete_data, CrossTable(worktype, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(residencetype, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(heartdisease, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(evermarried, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(heartdisease, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(smokingstatus, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(worktype, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

with(complete_data, CrossTable(gender, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Reject H0, Statistically significant

Discussion: Key Insights from Exploratory Data Analysis

Exploratory analysis of the Stroke data set revealed several important patterns and potential data quality issues:

1. Data Quality Issues

  • The data set does not contain explicit missing values, but BMI has missing data and the “Unknown” level in smokingstatus is treated as missing.
  • These missing values are replaced using imputation methods. As BMI and Age are numerical variables, predictive mean matching imputation is used, as it is a common method that is robust for numerical variables. Random Forest imputation is used for smokingstatus because the Stroke data set contains a mix of categorical and numerical variables, and this method prioritizes accuracy and accommodates more complex relationships.
  • For gender, to ensure consistent levels, Other was removed as there is only one observation.

2. Class Imbalance & Data Distribution

  • The data set contains more non-stroke cases (stroke = 0) than stroke cases (stroke = 1), which can impact model performance.

  • Histograms indicate extreme skewness in avgglucoselevel and BMI, which may need to be log transformed as model performance can suffer.

3. Data Pre-processing Needs

  • Scaling numeric predictors age, avgglucoselevel, and BMI for LDA/QDA or regularized logistic regression would improve model performance

4. Bivariate analysis

  • Those who have stroke are typically older in age and fall into generally two clusters of avgglucoselevel

  • The average bmi of those who have stroke is 30 as opposed to 28 who do not have stroke, whereas the average age of those who have stroke is 67 as opposed to 41 who do not have. The avgglucoselevel of those who have stroke is 132 as opposed to 104 who do not.

  • Chi-squared test indicated there is no statistically significant association between stroke with gender or residencetype, but there is a statistically significant association between stroke and evermarried, hypertension, heartdisease, worktype, and smokingstatus.

  • Grouped scatter plots and multi-collinearity statistical tests indicate that various interaction terms will need to be considered when modelling as basic logistic regression models would suffer from high multicollinearity. The following interactions require further analysis as to their significance:

    • age:hypertension, age:heartdisease, age:evermarried, age:averageglucoselevel

    • bmi:hypertension, bmi:evermarried, bmi:heartdisease

    • avgglucoselevel:hypertension, avgglucoselevel:heartdisease, avgglucoselevel:evermarried

    • worktype:evermarried, worktype:heartdisease, worktype:hypertension, worktype:smokingstatus

    • smokingstatus:evermarried, smokingstatus:heartdisease

    • hypertension:evermarried, hypertension:heartdisease


Machine Learning Models for Stroke Prediction

This section applies Logistic Regression, Linear Discriminant Analysis(LDA) , Quadratic Discriminant Analysis(QDA) and Random Forest to predict whether a patient has stroke.

Logistic Regression to test significance of interactions

# For plausible interactions, test individually:
# Each candidate two-way interaction is fit in its own logistic regression
# (stroke ~ A*B expands to A + B + A:B) and judged from the summary() z-tests.
# Note the following are those that are significant; for interactions that
# were tested and not significant, please see the appendix.
# NOTE(review): comments record conclusions read off the summaries at a 0.05
# threshold; no multiple-testing correction is applied across these 8+ fits.

fit_int2 <- glm(stroke ~ age*heartdisease, data=complete_data, family = "binomial")
summary(fit_int2)
# age:heartdisease statistically significant

fit_int5 <- glm(stroke ~ bmi*hypertension, data=complete_data, family = "binomial")
summary(fit_int5)
# bmi:hypertension statistically significant

fit_int7 <- glm(stroke ~ bmi*evermarried, data=complete_data, family = "binomial")
summary(fit_int7)
# statistically significant, interaction more significant than bmi main effect

fit_int8 <- glm(stroke ~ avgglucoselevel*hypertension, data=complete_data, family = "binomial")
summary(fit_int8)
# avgglucoselevel:hypertension statistically significant

fit_int11 <- glm(stroke ~ worktype*evermarried, data=complete_data, family = "binomial")
summary(fit_int11)
# statistically significant for worktypePrivate:evermarriedYes (but not for the other worktype levels)

fit_int16 <- glm(stroke ~ smokingstatus*heartdisease, data=complete_data, family = "binomial")
summary(fit_int16)
# statistically significant (threshold of 0.05)

fit_int17 <- glm(stroke ~ hypertension*evermarried, data=complete_data, family = "binomial")
summary(fit_int17)
# hypertension:evermarried statistically significant

fit_int18 <- glm(stroke ~ hypertension*heartdisease, data=complete_data, family = "binomial")
summary(fit_int18)
# hypertension:heartdisease statistically significant

Discussion: Key Insights from Logistic Regression

  • The following two-way interactions were statistically significant and could be given further consideration:
    • age:heartdisease, bmi:hypertension,bmi:evermarried
    • avgglucoselevel:hypertension, smokingstatus:heartdisease
    • hypertension:evermarried, hypertension:heartdisease

Stepwise Logistic Regression with Interaction Terms

## 
## Call:
## glm(formula = stroke ~ age + hypertension + heartdisease + avgglucoselevel + 
##     smokingstatus + evermarried + age:heartdisease + hypertension:evermarried + 
##     hypertension:heartdisease, family = "binomial", data = complete_data)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -8.207324   0.485267 -16.913  < 2e-16 ***
## age                            0.070237   0.006192  11.343  < 2e-16 ***
## hypertension1                  1.737507   0.496452   3.500 0.000466 ***
## heartdiseaseYes                2.633869   1.401689   1.879 0.060235 .  
## avgglucoselevel                0.004925   0.001260   3.908 9.29e-05 ***
## smokingstatusformerly smoked   0.117366   0.172033   0.682 0.495093    
## smokingstatussmokes            0.421086   0.197419   2.133 0.032928 *  
## evermarriedYes                 0.230015   0.308296   0.746 0.455616    
## age:heartdiseaseYes           -0.029095   0.019625  -1.483 0.138196    
## hypertension1:evermarriedYes  -1.192216   0.517820  -2.302 0.021314 *  
## hypertension1:heartdiseaseYes -0.671801   0.444982  -1.510 0.131113    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1728.3  on 4907  degrees of freedom
## Residual deviance: 1360.7  on 4897  degrees of freedom
## AIC: 1382.7
## 
## Number of Fisher Scoring iterations: 8
##                           age                 hypertension1 
##                      95.65685                     100.94144 
##               heartdiseaseYes               avgglucoselevel 
##                     453.79200                      15.37840 
##  smokingstatusformerly smoked           smokingstatussmokes 
##                      25.66308                      31.21740 
##                evermarriedYes           age:heartdiseaseYes 
##                     105.72878                     427.46181 
##  hypertension1:evermarriedYes hypertension1:heartdiseaseYes 
##                      99.41058                      11.34878

Discussion: Key Insights from Stepwise Logistic Regression

  • Stepwise logistic regression selected the following as significant predictors:
    • age, hypertension, heartdisease, avgglucoselevel, smokingstatus, and evermarried (note: bmi does not appear in the final stepwise model printed above).

    • Interaction terms age:heartdisease, smokingstatus:heartdisease, and hypertension:evermarried were also selected

    • Interactions that were not statistically significant were omitted.

Validate Stepwise Logistic Regression with 10-fold Cross Validation

##                                    (Intercept) 
##                                   -8.161300874 
##                                            age 
##                                    0.070380992 
##                                  hypertension1 
##                                    1.591438888 
##                                heartdiseaseYes 
##                                    2.202423520 
##                                avgglucoselevel 
##                                    0.004988396 
##                 `smokingstatusformerly smoked` 
##                                    0.070818750 
##                            smokingstatussmokes 
##                                    0.310328425 
##                                 evermarriedYes 
##                                    0.238959131 
##                          `age:heartdiseaseYes` 
##                                   -0.028988471 
## `heartdiseaseYes:smokingstatusformerly smoked` 
##                                    0.266659370 
##          `heartdiseaseYes:smokingstatussmokes` 
##                                    0.513755102 
##                 `hypertension1:evermarriedYes` 
##                                   -1.181582964

Interpretation of the validated Stepwise Logistic Regression (Hybrid)

  • A one year increase in age, increases the log odds of having a stroke by 0.07132, ceteris paribus.

  • For one mg/dl increase in avgglucoselevel, the log odds of having a stroke increase by 0.00425, ceteris paribus.

  • For those with hypertension, the log odds of having a stroke is 1.34583 units higher than for those without hypertension, ceteris paribus.

  • For those with heartdisease, the log odds of having a stroke is 2.08845 units higher than for those without heart disease, ceteris paribus.

  • For those who formerly smoked, the log odds of having a stroke is 0.12441 units higher than for those who never smoked, ceteris paribus.

  • For those who smoke, the log odds of having a stroke is 0.10365 units higher than for those who never smoked, ceteris paribus.

  • For those who have ever married, the log odds of having a stroke is 0.07084 units higher than for those who have never been married, ceteris paribus.

Predictions and Model Evaluation

## Area under the curve: 0.8434

LASSO Regression

## Best lamda (smallest cv-error): 0.002177595
## 23 x 1 sparse Matrix of class "dgCMatrix"
##                                              unstandardized_coef
## (Intercept)                                         -7.330086028
## genderFemale                                         .          
## age                                                  0.063732267
## hypertension1                                        0.544887569
## heartdiseaseYes                                      0.194898669
## evermarriedYes                                       .          
## worktypeGovt_job                                     .          
## worktypeNever_worked                                 .          
## worktypePrivate                                      0.050503883
## worktypeSelf-employed                               -0.194880821
## residencetypeUrban                                   .          
## avgglucoselevel                                      0.004392215
## bmi                                                  .          
## smokingstatusformerly smoked                         .          
## smokingstatussmokes                                  0.049396449
## age:heartdiseaseYes                                  .          
## hypertension1:bmi                                    .          
## evermarriedYes:bmi                                   .          
## hypertension1:avgglucoselevel                        .          
## heartdiseaseYes:smokingstatusformerly smoked         0.199882587
## heartdiseaseYes:smokingstatussmokes                  0.668436163
## hypertension1:evermarriedYes                         .          
## hypertension1:heartdiseaseYes                       -0.259171998

Interpretation of the LASSO Regression with Interactions

  • For a one year increase in age, the log odds of having a stroke increases by 0.064, ceteris paribus.

  • For a one mg/dl increase in avgglucoselevel, the log odds of having a stroke increase by 0.00357, ceteris paribus.

  • For those with hypertension, the log odds of having a stroke is 0.545 units higher compared to those without hypertension, ceteris paribus.

  • For those with heartdisease, the log odds of having a stroke is 0.195 units higher compared to those without heart disease, ceteris paribus.

  • For those who have heartdisease and formerly smoked, the log odds of having stroke is 0.200 units higher compared to if only the individual effects of heartdisease and never smoked were added alone, ceteris paribus.

  • For those who have heartdisease and smoke, the additional effect on the log odds of having stroke is 0.668 units higher than would be obtained by adding only the individual effects of heart disease and smoking status alone, ceteris paribus.

  • For individuals with a worktype of Self-employed, the log odds of having a stroke is -0.195 lower compared to those with a worktype of ‘Children’, ceteris paribus.

  • For individuals with a ‘worktype’ of Private, the log odds of having a stroke is 0.051 units higher compared to those with a worktype of ‘Children’, ceteris paribus.

Predictions and Model Evaluation

## Area under the curve: 0.8528

Linear Discriminant Analysis (LDA)

## Linear Discriminant Analysis 
## 
## 4908 samples
##    7 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4417, 4417, 4417, 4417, 4417, 4417, ... 
## Resampling results:
## 
##   ROC        Sens       Spec      
##   0.8329498  0.9778646  0.09595238

Predictions and Model Evaluation

Quadratic Discriminant Analysis (QDA)

## Quadratic Discriminant Analysis 
## 
## 4908 samples
##    7 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4417, 4417, 4417, 4419, 4417, 4417, ... 
## Resampling results:
## 
##   ROC        Sens       Spec    
##   0.8192788  0.9091335  0.387381
## QDA AUC: 0.8161092

Random Forest

Optimal mtry Tuning via Out-Of-Bag(OOB) Error

# Candidate numbers of predictors tried at each split.
mtry_grid <- c(2, 3, 4, 5)

# randomForest() is stochastic: without a seed, best_mtry can change on every
# knit. Seed set here so the tuning result is reproducible.
set.seed(123)

# For each candidate mtry, fit a 500-tree forest and record the OOB error
# averaged over all trees. vapply() (rather than sapply) guarantees a
# numeric vector result regardless of input.
# NOTE(review): averaging err.rate over all trees mixes in noisy early-forest
# values; the final-tree OOB error model$err.rate[500, "OOB"] is a common
# alternative worth comparing.
oob_results <- vapply(mtry_grid, function(m) {
  model <- randomForest(
    formula,
    data = complete_data,
    mtry = m,
    strata = complete_data$stroke,    # Balance class splits
    ntree = 500
  )
  mean(model$err.rate[, "OOB"])  # Average OOB error across trees
}, numeric(1))

# mtry with the smallest average OOB error.
best_mtry <- mtry_grid[which.min(oob_results)]

Fit Random Forest model with optimal feature splitting (mtry)

set.seed(123)
# Exploratory forest used only to inspect the OOB error trace and variable
# importance. Named distinctly from the caret `rf_model` fit below, which
# previously reused (and silently clobbered) the same name.
rf_oob_fit <- randomForest(
  formula,
  data = complete_data,
  mtry = best_mtry,  # Optimal mtry from the OOB tuning step
  ntree = 1000,      # Sufficiently large to observe stabilization
  strata = complete_data$stroke,  # For class imbalance
  sampsize = rep(min(table(complete_data$stroke)), 2),  # Balanced samples
  importance = TRUE
)

# Once the OOB error stabilizes, adding trees yields no further benefit;
# a random forest's generalization error does not degrade with more trees.

# OOB error trace for the minority ("Yes") class only.
oob_yes <- rf_oob_fit$err.rate[, "Yes"]
plot(oob_yes, type = "l", col = "red", ylab = "OOB Error (Yes)")
abline(h = min(oob_yes), col = "blue", lty = 2)  # Mark the minimum OOB error

cat("Minimum OOB error:", min(oob_yes), "\n")
## Minimum OOB error: 0.1866029
# Refit via caret for cross-validated ROC. `ctrl` is assumed to be a
# trainControl object with classProbs = TRUE, summaryFunction = twoClassSummary
# and savePredictions enabled, defined earlier in the file -- TODO confirm.
# NOTE(review): unlike the randomForest() fit above, strata/sampsize are NOT
# passed here, so this CV model sees the raw class imbalance; the Sens = 1 /
# Spec = 0 resampling result below suggests it predicts "No" for everyone.
rf_model <- train(
  formula, 
  data = complete_data, 
  method = "rf", 
  trControl = ctrl, 
  metric = "ROC",
  tuneGrid = data.frame(mtry = best_mtry), 
  ntree = 200  # Number of trees
)

rf_model
## Random Forest 
## 
## 4908 samples
##    7 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4417, 4417, 4417, 4418, 4418, 4417, ... 
## Resampling results:
## 
##   ROC        Sens  Spec
##   0.7541633  1     0   
## 
## Tuning parameter 'mtry' was held constant at a value of 2
# ROC from the pooled cross-validated predictions held in rf_model$pred
# (requires savePredictions in the trainControl -- see note above the fit).
cv_roc_rf <- roc(
  response = rf_model$pred$obs, 
  predictor = rf_model$pred$Yes,  # Use "Yes" or the appropriate level name
  levels = c("No", "Yes")          # Must match your factor levels
)

# Underlying randomForest object extracted from the caret wrapper.
final_rf <- rf_model$finalModel

# Plot variable importance
varImpPlot(final_rf)

Model Performance Comparison

ROC Curve and Area under the Curve(AUC) Comparison

# Overlay the cross-validated ROC curves of all five classifiers.
par(mfrow=c(1,1))
plot(cv_roc_log, col="red", main="ROC Curve Comparison")
lines(cv_roc_lasso, col="orange")
lines(cv_roc_lda, col="blue")
lines(cv_roc_qda, col="green")
lines(cv_roc_rf, col="purple")
# FIX: the legend previously listed 5 models but supplied only 4 colours
# ("orange" for LASSO was missing), mislabelling every curve after "Logistic".
legend("bottomright", legend=c("Logistic", "LASSO", "LDA", "QDA", "Random Forest"), col=c("red", "orange", "blue", "green", "purple"), lwd=2)

# Cross-validated AUC for each model, for a single-number comparison.
auc_values <- c(
  "Fwd_Step_Logit" = auc(cv_roc_log),
  "LASSO" = auc(cv_roc_lasso),
  "LDA" = auc(cv_roc_lda),
  "QDA" = auc(cv_roc_qda),
  "Random Forest" = auc(cv_roc_rf)
)

auc_values
## Fwd_Step_Logit          LASSO            LDA            QDA  Random Forest 
##      0.8433760      0.8527570      0.8311256      0.8161092      0.7533915
# Report the classifier with the largest cross-validated AUC.
best_model <- names(auc_values)[which.max(auc_values)]
cat("Best Model(Highest-AUC):", best_model, "\n")
## Best Model(Highest-AUC): LASSO
# All perform similarly, feature engineering may be needed to improve classification 

Discussion: Model Performance & Insights

Which Model Performed Best?

  • LASSO had the highest ROC curve (largest AUC) : Suggests that identifying a small subset of features is beneficial for classification.

    • A sparse linear boundary was enough for classification.

    • L1 regularization effectively handles multicollinearity and can help reduce over-fitting and as it performs feature selection by shrinking some coefficients to zero.

  • Stepwise Logistic Regression had a similar ROC curve to LASSO: A simple linear boundary is sufficient for classification, meaning the stroke data is linearly separable.

  • LDA does not perform better than Logistic Regression: Although the assumption of normally distributed data holds well, the assumption of homogeneity of covariance matrices does not. LDA is not the most suitable classifier.

  • QDA outperforms neither LDA nor Logistic Regression: This suggests that the relationship between features and stroke is not non-linear, so QDA’s ability to model curved decision boundaries is not advantageous.

  • Random Forest has the lowest AUC: Suggests that the model underperforms due to its inability to tackle multi-collinearity. The model is ineffective as it does not penalize redundant features and is sensitive to noise. However, it is still useful for assessing feature importance.

  • All models perform similarly: Feature engineering or additional predictors might be needed to improve classification.

Key Takeaways: Although Random forest suggested the most important predictors of stroke are age, avgglucoselevel and bmi, LASSO favors heartdisease and hypertension predictors. The best model depends on AUC, ROC-curve, how the model performs feature selection and meets underlying assumptions. Handling class imbalance can improve performance. Pooling data across all imputed data sets would have provided better generalization and more robust standard error estimates.


Appendix

Question 2: Statistical Tests with no statistical significance
Relationship Checks (Response x Explanatory)
# Chi-square tests of stroke vs predictors that showed NO association.
# Style: TRUE instead of the reassignable alias T; plain string for format.

# Stroke x Gender
with(complete_data, CrossTable(gender, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant
# No association between stroke and gender

# Stroke x residence type
with(complete_data, CrossTable(residencetype, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant
# No association between stroke and residence type
Multicollinearity Checks (Explanatory x Explanatory)
# T-tests (numeric vs binary explanatory pairs with no significant difference)
t.test(complete_data$age ~ complete_data$residencetype, var.equal = TRUE) # p = 0.3, fail to reject H0
t.test(complete_data$bmi ~ complete_data$gender, var.equal = TRUE) # p = 0.1, fail to reject H0
t.test(complete_data$bmi ~ complete_data$residencetype, var.equal = TRUE) # p = 1, fail to reject H0
t.test(complete_data$avgglucoselevel ~ complete_data$residencetype, var.equal = TRUE) # Fail to reject H0, p = 0.7
# Chi-square tests (categorical pairs with no significant association).
# Style: TRUE instead of the reassignable alias T; plain string for format.
with(complete_data, CrossTable(hypertension, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(gender, residencetype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(residencetype, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(gender, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(residencetype, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(residencetype, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(residencetype, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant

with(complete_data, CrossTable(smokingstatus, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = TRUE, format = "SPSS"))
# Fail to reject H0, Not statistically significant
Interaction Terms with no statistical significance
# Two-way interactions tested individually; none of the interaction terms
# below reached significance at the 0.05 level (the comment under each fit
# names the non-significant interaction).
fit_int <- glm(stroke ~ age*hypertension, data=complete_data, family = "binomial")
summary(fit_int)
# age:hypertension not significant

fit_int3 <- glm(stroke ~ age*evermarried, data=complete_data, family = "binomial")
summary(fit_int3)
# age:evermarried not significant

fit_int4 <- glm(stroke ~ age*avgglucoselevel, data=complete_data, family = "binomial")
summary(fit_int4)
# age:avgglucoselevel not significant

fit_int6 <- glm(stroke ~ bmi*heartdisease, data=complete_data, family = "binomial")
summary(fit_int6)
# bmi:heartdisease not significant

fit_int9 <- glm(stroke ~ avgglucoselevel*heartdisease, data=complete_data, family = "binomial")
summary(fit_int9)
# avgglucoselevel:heartdisease not significant

fit_int10 <- glm(stroke ~ avgglucoselevel*evermarried, data=complete_data, family = "binomial")
summary(fit_int10)
# avgglucoselevel:evermarried not significant

fit_int12 <- glm(stroke ~ worktype*heartdisease, data=complete_data, family = "binomial")
summary(fit_int12)
# worktype:heartdisease not significant

fit_int13 <- glm(stroke ~ worktype*hypertension, data=complete_data, family = "binomial")
summary(fit_int13)
# worktype:hypertension not significant

fit_int14 <- glm(stroke ~ worktype*smokingstatus, data=complete_data, family = "binomial")
summary(fit_int14)
# worktype:smokingstatus not significant

fit_int15 <- glm(stroke ~ smokingstatus*evermarried, data=complete_data, family = "binomial")
summary(fit_int15)
# smokingstatus:evermarried not significant
Three way interaction terms all with no statistical significance
# Three-way interactions, chosen from combinations of the two-way interactions
# that tested significant above. Candidates considered:
# age:heartdisease:hypertension
# bmi:hypertension:evermarried
# avgglucoselevel:hypertension:heartdisease
# bmi:evermarried:hypertension
# age:heartdisease:smokingstatus
# (A*B*C expands to all main effects, all two-way terms, and the three-way term.)

fit_int19 <- glm(stroke ~ age*heartdisease*hypertension, data=complete_data, family = "binomial")
summary(fit_int19)

fit_int20<- glm(stroke ~ bmi*hypertension*evermarried, data=complete_data, family = "binomial")
summary(fit_int20)

fit_int21 <- glm(stroke ~ avgglucoselevel*hypertension*heartdisease, data=complete_data, family = "binomial")
summary(fit_int21)

fit_int22 <- glm(stroke ~ bmi*evermarried*hypertension, data=complete_data, family = "binomial")
summary(fit_int22)

fit_int23 <- glm(stroke ~ age*heartdisease*smokingstatus, data=complete_data, family = "binomial")
summary(fit_int23)
# All three-way interaction terms statistically insignificant